import csv
import os
import time
import pickle
import pandas as pd
import numpy as np
from keras import models, layers
from keras.applications.regnet import decode_predictions
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.layers import Convolution1D, MaxPool1D, Flatten
from keras.preprocessing.text import Tokenizer
from keras_preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras.models import load_model
from keras.models import Sequential

# Configure matplotlib so Chinese glyphs render correctly.
plt.rcParams['font.sans-serif'] = ['KaiTi']  # CJK-capable default font (SimHei also works)
plt.rcParams['axes.unicode_minus'] = False   # keep the minus sign rendering when saving figures

# Optional GPU setup (CPU-only readers can leave this commented out).
# Caps per-process GPU memory, e.g. 0.8 -> use at most 80% of the card.
# NOTE(review): the env-var names below are typos — the real ones are
# "CUDA_DEVICE_ORDER" and "PCI_BUS_ID"; also tf.GPUOptions/tf.Session are
# TF1-era APIs — confirm before re-enabling.
# os.environ["CUDA_DEVICES_ORDER"] = "PCI_BUS_IS"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
# sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

# Wall-clock reference for the "Time used" printouts at the end of the script.
start = time.perf_counter()
# ---- Load the three dataset splits ----
# NOTE(review): train is read with the default encoding while test/val use
# gb2312 — confirm this asymmetry matches how the files were written.
train_df = pd.read_csv('train_cc.csv')
test_df = pd.read_csv('test_cc.csv', encoding='gb2312')
val_df = pd.read_csv('val_cc.csv', encoding='gb2312')

# ---- Encode string labels as integer ids ----
# Fit on the training labels only; val/test reuse the same mapping.
le = LabelEncoder()
train_y = le.fit_transform(train_df.label).reshape(-1, 1)
val_y = le.transform(val_df.label).reshape(-1, 1)
test_y = le.transform(test_df.label).reshape(-1, 1)

# ---- One-hot encode the integer ids for categorical_crossentropy ----
ohe = OneHotEncoder()
train_y = ohe.fit_transform(train_y).toarray()
val_y = ohe.transform(val_y).toarray()
test_y = ohe.transform(test_y).toarray()
# ---- Tokenize the text corpus ----
max_words = 1400   # vocabulary size kept by the Tokenizer
max_len = 25       # every sample is padded/truncated to this many tokens

tok = Tokenizer(num_words=max_words)
# Cast every cell to str in case the corpus contains numeric entries.
train_content = [str(cell) for cell in train_df.content.tolist()]
val_content = [str(cell) for cell in val_df.content.tolist()]
test_content = [str(cell) for cell in test_df.content.tolist()]
tok.fit_on_texts(train_content)

# Persist the fitted Tokenizer, then reload it — a round-trip that mirrors
# how inference-time code would restore the vocabulary.
with open('tok.pickle', 'wb') as handle:
    pickle.dump(tok, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('tok.pickle', 'rb') as handle:
    tok = pickle.load(handle)

# word_index maps token -> integer id; word_counts maps token -> frequency.
print(tok.word_index.items())
for pair in list(tok.word_index.items())[:10]:
    print(pair)
print("===================")
for pair in list(tok.word_counts.items())[:10]:
    print(pair)

# ---- Convert each document to a fixed-length vector of token ids ----
train_seq = tok.texts_to_sequences(train_content)
val_seq = tok.texts_to_sequences(val_content)
test_seq = tok.texts_to_sequences(test_content)

# Pad/truncate every sequence to exactly max_len entries.
train_seq_mat = sequence.pad_sequences(train_seq, maxlen=max_len)
val_seq_mat = sequence.pad_sequences(val_seq, maxlen=max_len)
test_seq_mat = sequence.pad_sequences(test_seq, maxlen=max_len)
# print(test_seq_mat)

# ---- Build the text-CNN classifier ----

# Number of output classes (must match the label encoder's vocabulary).
# NOTE(review): the original comment said "13 classes" while the code uses
# 55 — 55 matches the Dense head below; confirm against the label set.
num_labels = 55
inputs = Input(name='inputs', shape=[max_len], dtype='float64')
# Token embedding. BUG FIX: the original set trainable=False while claiming
# pre-trained vectors, but never passed `weights=` — the embeddings were
# random AND frozen, so the layer could never learn. Train them instead.
layer = Embedding(max_words + 1, 128, input_length=max_len, trainable=True)(inputs)
# 1-D convolution (window size 3, 16 filters) followed by max pooling.
cnn = Convolution1D(16, 3, padding='same', strides=1, activation='relu')(layer)
cnn = MaxPool1D(pool_size=4)(cnn)
flat = Flatten()(cnn)
drop = Dropout(0.3)(flat)  # dropout to curb over-fitting
# Softmax classification head over the num_labels classes.
main_output = Dense(num_labels, activation='softmax')(drop)
model = Model(inputs=inputs, outputs=main_output)

# Loss/optimizer/metric for multi-class one-hot targets.
model.summary()
model.compile(loss="categorical_crossentropy",
              optimizer='adam',  # RMSprop() is a reasonable alternative
              metrics=["accuracy"])

# ---- Train or evaluate ----
# Set flag to "train" to fit and save the model, anything else to evaluate.
# BUG FIX: the original used the typo "trai", which silently fell through to
# the evaluation branch; "test" makes the intent explicit (same behavior).
flag = "test"
if flag == "train":
    print("模型训练")
    # Early-stop when val_loss improves by less than 1e-4 for 2 epochs.
    # NOTE: validation_split was removed — Keras ignores it whenever
    # validation_data is supplied, so it had no effect.
    model_fit = model.fit(train_seq_mat, train_y, batch_size=8, epochs=50,
                          validation_data=(val_seq_mat, val_y),
                          callbacks=[EarlyStopping(monitor='val_loss',
                                                   min_delta=0.0001,
                                                   patience=2,
                                                   restore_best_weights=True)])
    # Persist, then drop the in-memory model.
    model.save('my_model.h5')
    del model
    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)
    print(model_fit.history)

else:
    print("模型预测")
    # Load the previously trained model and predict on the test set.
    model = load_model('my_model.h5')
    test_pre = model.predict(test_seq_mat)
    print(test_pre[0])
    # Confusion matrix of true vs predicted class indices (kept for
    # reporting/visualization).
    confm = metrics.confusion_matrix(np.argmax(test_y, axis=1),
                                     np.argmax(test_pre, axis=1))
    pred_ids = np.argmax(test_pre, axis=1)

    # Map predicted ids back to label strings. BUG FIX: the original wrote
    # the predicted label into BOTH csv columns and dumped the raw ndarray
    # (cells looked like "['x']"); 'content' now holds the actual test text
    # and labels are unwrapped to plain strings.
    labels = [le.inverse_transform([pid])[0] for pid in pred_ids]
    with open('oo.csv', 'w', newline='', encoding='utf8') as f:
        writer = csv.writer(f)
        writer.writerow(['label', 'content'])
        writer.writerows(zip(labels, test_content))

    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)

